// #define max(a, b) ((a) > (b) ? (a):(b))
// #define min(a, b) ((a) < (b) ? (a):(b))
#ifdef __sw_slave__
/*
 * Slave-side kernel: packs this core's share of the cells of the brick
 * [xlo, xhi) x [ylo, yhi) x [zlo, zhi) into the buffer described by *pm.
 *
 * Cells are numbered linearly with x as the slowest and z as the fastest
 * index. The numbering is cut into NPE_PACK equal chunks of
 * ceil(ncells / NPE_PACK) cells, and the core with id `altid` handles chunk
 * number `altid`. Output starts at lpm.buf + lpm.offset[altid] and advances
 * by whatever pack_cell returns for each cell.
 *
 * pm: parameter block; it and the grid it points to are first copied into
 *     local variables via pe_get (presumably a DMA transfer from host
 *     memory -- confirm against the pe_get definition).
 */
void CAT(pack_brick, SUFFIX)(vec_pack_param_t *pm) {
  const int altid = _COL * 8 + _ROW;
  /* Only the first NPE_PACK cores take part in packing. */
  if (altid >= NPE_PACK) return;

  dma_init();
  vec_pack_param_t lpm;
  cellgrid_t lgrid;
  /* Fetch the parameter block first: it holds the pointer to the grid. */
  pe_get(pm, &lpm, sizeof(lpm));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(lgrid));
  dma_syn();

  const int nx = lpm.xhi - lpm.xlo;
  const int ny = lpm.yhi - lpm.ylo;
  const int nz = lpm.zhi - lpm.zlo;
  const int ncells = nx * ny * nz;

  /* Half-open range [first, last) of linear cell indices owned by this core. */
  const int chunk = (ncells + NPE_PACK - 1) / NPE_PACK;
  const int first = chunk * altid;
  int last = chunk * (altid + 1);
  if (last > ncells) last = ncells;

  char *cursor = lpm.buf + lpm.offset[altid];

  /* `linear` counts visited cells, so it always equals
   * ((x - xlo) * ny + (y - ylo)) * nz + (z - zlo). */
  int linear = 0;
  for (int x = lpm.xlo; x < lpm.xhi; x++) {
    for (int y = lpm.ylo; y < lpm.yhi; y++) {
      for (int z = lpm.zlo; z < lpm.zhi; z++, linear++) {
        if (linear >= first && linear < last) {
          celldata_t *cell = get_cell_xyz(&lgrid, x, y, z);
          cursor += CAT(pack_cell, SUFFIX)(cursor, cell);
        }
      }
    }
  }
}

/*
 * Slave-side kernel: unpacks this core's share of the cells of the brick
 * [xlo, xhi) x [ylo, yhi) x [zlo, zhi) from the buffer described by *pm.
 *
 * Cells are numbered linearly with x as the slowest and z as the fastest
 * index. The numbering is cut into NPE_PACK equal chunks of
 * ceil(ncells / NPE_PACK) cells, and the core with id `altid` handles chunk
 * number `altid`. Input starts at lpm.buf + lpm.offset[altid] and advances
 * by whatever unpack_cell returns for each cell.
 *
 * pm: parameter block; it and the grid it points to are first copied into
 *     local variables via pe_get (presumably a DMA transfer from host
 *     memory -- confirm against the pe_get definition).
 */
void CAT(unpack_brick, SUFFIX)(vec_pack_param_t *pm) {
  const int altid = _COL * 8 + _ROW;
  /* Only the first NPE_PACK cores take part in unpacking. */
  if (altid >= NPE_PACK) return;

  dma_init();
  vec_pack_param_t lpm;
  cellgrid_t lgrid;
  /* Fetch the parameter block first: it holds the pointer to the grid. */
  pe_get(pm, &lpm, sizeof(lpm));
  dma_syn();
  pe_get(lpm.grid, &lgrid, sizeof(lgrid));
  dma_syn();

  const int nx = lpm.xhi - lpm.xlo;
  const int ny = lpm.yhi - lpm.ylo;
  const int nz = lpm.zhi - lpm.zlo;
  const int ncells = nx * ny * nz;

  /* Half-open range [first, last) of linear cell indices owned by this core. */
  const int chunk = (ncells + NPE_PACK - 1) / NPE_PACK;
  const int first = chunk * altid;
  int last = chunk * (altid + 1);
  if (last > ncells) last = ncells;

  char *cursor = lpm.buf + lpm.offset[altid];

  /* `linear` counts visited cells, so it always equals
   * ((x - xlo) * ny + (y - ylo)) * nz + (z - zlo). */
  int linear = 0;
  for (int x = lpm.xlo; x < lpm.xhi; x++) {
    for (int y = lpm.ylo; y < lpm.yhi; y++) {
      for (int z = lpm.zlo; z < lpm.zhi; z++, linear++) {
        if (linear >= first && linear < last) {
          celldata_t *cell = get_cell_xyz(&lgrid, x, y, z);
          cursor += CAT(unpack_cell, SUFFIX)(cursor, cell);
        }
      }
    }
  }
}
#endif
#ifdef __sw_host__
// Kernels handed to qthread_spawn by the *_pre_sw host wrappers below.
// NOTE(review): presumably these are the toolchain-prefixed ("slave_") symbols
// of pack_brick / unpack_brick from the __sw_slave__ section -- confirm.
// Each takes a pointer to the shared parameter block.
extern void CAT(slave_pack_brick, SUFFIX)(vec_pack_param_t *);
extern void CAT(slave_unpack_brick, SUFFIX)(vec_pack_param_t *);
// size_t CAT(pack_brick_sw, SUFFIX)(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
//   vec_pack_param_t pm;
//   pm.grid = grid;
//   pm.buf = buf;
//   pm.xlo = xlo;
//   pm.xhi = xhi;
//   pm.ylo = ylo;
//   pm.yhi = yhi;
//   pm.zlo = zlo;
//   pm.zhi = zhi;
//   int xlen = xhi - xlo;
//   int ylen = yhi - ylo;
//   int zlen = zhi - zlo;

//   int npack_pe = (xlen * ylen * zlen + NPE_PACK - 1) / NPE_PACK;
//   div_magic_t div_npack_magic;
//   make_magic(&div_npack_magic, npack_pe);
//   size_t offset = 0;
//   int mod = 0, div = 0;
//   for (int i = xlo; i < xhi; i ++) {
//     for (int j = ylo; j < yhi; j ++) {
//       for (int k = zlo; k < zhi; k ++) {
//         int idx = ((i - xlo) * ylen + j - ylo) * zlen + k - zlo;
//         int div = MAGIC_DIV(idx, div_npack_magic);
//         int mod = idx - div * npack_pe;
//         if (mod == 0) {
//           pm.offset[div] = offset;
//         }
//         celldata_t *cell = get_cell_xyz(grid, i, j, k);
//         size_t cell_size = CAT(estimate, SUFFIX)(cell);
//         offset += cell_size;
//       }
//     }
//   }
//   // printf("pack %d %d %d %d %d %d\n", pm.xlo, pm.xhi, pm.ylo, pm.yhi, pm.zlo, pm.zhi);
//   qthread_spawn(CAT(slave_pack_brick, SUFFIX), &pm);
//   qthread_join();
//   return offset;
// }

// size_t CAT(unpack_brick_sw, SUFFIX)(char *buf, cellgrid_t *grid, int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) {
//   vec_pack_param_t pm;
//   pm.grid = grid;
//   pm.buf = buf;
//   pm.xlo = xlo;
//   pm.xhi = xhi;
//   pm.ylo = ylo;
//   pm.yhi = yhi;
//   pm.zlo = zlo;
//   pm.zhi = zhi;
//   int xlen = xhi - xlo;
//   int ylen = yhi - ylo;
//   int zlen = zhi - zlo;

//   int npack_pe = (xlen * ylen * zlen + NPE_PACK - 1) / NPE_PACK;
//   div_magic_t div_npack_magic;
//   make_magic(&div_npack_magic, npack_pe);
//   size_t offset = 0;
  
//   for (int i = xlo; i < xhi; i ++) {
//     for (int j = ylo; j < yhi; j ++) {
//       for (int k = zlo; k < zhi; k ++) {
//         int idx = ((i - xlo) * ylen + j - ylo) * zlen + k - zlo;
//         int div = MAGIC_DIV(idx, div_npack_magic);
//         int mod = idx - div * npack_pe;
//         if (mod == 0) {
//           pm.offset[div] = offset;
//         }
//         size_t cell_size = *(long*)(buf + offset);
//         offset += cell_size;
//       }
//     }
//   }
//   // for (int i = 0; i < NPE_PACK; i ++) {
//   // printf("unpack %d %d %d %d %d %d\n", pm.xlo, pm.xhi, pm.ylo, pm.yhi, pm.zlo, pm.zhi);
//   // }
  
//   qthread_spawn(CAT(slave_unpack_brick, SUFFIX), &pm);
//   qthread_join();
//   // exit(0);
// }

/*
 * Host-side entry point: runs the slave pack kernel on a prepared
 * parameter block and waits for it to finish.
 *
 * buf:  destination buffer; stored into pm->buf before the kernel starts.
 * grid: not used by this function (the kernel reads the grid through pm).
 * pm:   parameter block; every field other than buf must already be filled
 *       in by the caller.
 *
 * Returns pm->total as read after the kernel has been joined.
 */
size_t CAT(pack_brick_pre_sw, SUFFIX)(char *buf, cellgrid_t *grid, vec_pack_param_t *pm) {
  (void)grid;

  pm->buf = buf;

  /* Launch the kernel and block until all slave threads are done. */
  qthread_spawn(CAT(slave_pack_brick, SUFFIX), pm);
  qthread_join();

  size_t total = pm->total;
  return total;
}

/*
 * Host-side entry point: runs the slave unpack kernel on a prepared
 * parameter block and waits for it to finish.
 *
 * buf:  source buffer; stored into pm->buf before the kernel starts.
 * grid: not used by this function (the kernel reads the grid through pm).
 * pm:   parameter block; every field other than buf must already be filled
 *       in by the caller.
 *
 * Returns pm->total as read after the kernel has been joined.
 */
size_t CAT(unpack_brick_pre_sw, SUFFIX)(char *buf, cellgrid_t *grid, vec_pack_param_t *pm) {
  (void)grid;

  pm->buf = buf;

  /* Launch the kernel and block until all slave threads are done. */
  qthread_spawn(CAT(slave_unpack_brick, SUFFIX), pm);
  qthread_join();

  size_t total = pm->total;
  return total;
}
#endif
